import pandas as pd
# ensure openpyxl==3.1.0, version 3.1.1 has a bug
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import os
MajorRoadsEmissionsURL = 'https://data.london.gov.uk/download/london-atmospheric-emissions-inventory-2013/8c520de2-c518-4e64-932c-0071ac826742/LAEI2013_MajorRoads_EmissionsbyLink_2013.xlsx'
SupportingDocURL = 'https://data.london.gov.uk/download/london-atmospheric-emissions-inventory-2013/1f3755f3-dae8-4d39-b9b4-0cc7adb4e826/1.%20Supporting%20Information.zip'
LocalFileMajorRoadsEmissions = 'data/LAEI2013_MajorRoads_EmissionsbyLink_2013.xlsx'
LocalFileSupportingDoc = 'data/1. Supporting Information.zip'
# Check if data folder exists, if not create it
if not os.path.exists('data'):
os.makedirs('data')
# Check if data is already downloaded, if not set load_from_local to False
if os.path.exists(LocalFileMajorRoadsEmissions):
load_from_local = True
else:
load_from_local = False
if load_from_local:
MajorRoadsEmissionsURL = LocalFileMajorRoadsEmissions
xls = pd.ExcelFile(MajorRoadsEmissionsURL)
if load_from_local:
DocZip = ZipFile(LocalFileSupportingDoc)
else:
resp = urlopen(SupportingDocURL)
DocZip = ZipFile(BytesIO(resp.read()))
DocZip.namelist()[:10]
['1. Supporting Information/', '1. Supporting Information/1. Road Traffic Data/', '1. Supporting Information/1. Road Traffic Data/Excel/', '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2008_AADT-VKM.xlsx', '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2010_AADT-VKM.xlsx', '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2013_AADT-VKM.xlsx', '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2020_AADT-VKM.xlsx', '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2025_AADT-VKM.xlsx', '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2030_AADT-VKM.xlsx', '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_VKM_AllYears_Summary.xlsx']
xls.sheet_names
['2013 LTS Rds', '2013 Other Major Rds']
df_LTSroads = pd.read_excel(xls, '2013 LTS Rds')
print(df_LTSroads.shape)
df_LTSroads.head()
(366220, 32)
| GridId | Toid | GRID_ExactCut_ID | Location_ExactCut | BoroughName_ExactCut | Lts | Length (m) | Emissions | Year | Pollutant | ... | Artic5Axle | Artic6Axle | PetrolCar | DieselCar | PetrolLgv | DieselLgv | LtBus | Coach | ElectricCar | ElectricLgv | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6253 | 4000000027908919 | 24 | External | NonGLA | 18898 | 50.761449 | DFT | 2013 | CO2 | ... | 0.241372 | 0.190560 | 8.761443 | 4.810774 | 0.037550 | 1.735121 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 6253 | 4000000027947931 | 24 | External | NonGLA | 18895 | 28.592125 | DFT | 2013 | CO2 | ... | 0.000000 | 0.000000 | 0.015535 | 0.008576 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 6253 | 4000000028013383 | 24 | External | NonGLA | 15816 | 5.101391 | DFT | 2013 | CO2 | ... | 0.027271 | 0.021509 | 0.939028 | 0.518684 | 0.004055 | 0.184415 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 6253 | 4000000028025820 | 24 | External | NonGLA | 15816 | 3.757501 | DFT | 2013 | CO2 | ... | 0.020087 | 0.015843 | 0.691654 | 0.382044 | 0.002987 | 0.135834 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 6253 | 4000000028029388 | 24 | External | NonGLA | 15816 | 1.624593 | DFT | 2013 | CO2 | ... | 0.008685 | 0.006850 | 0.299044 | 0.165180 | 0.001292 | 0.058729 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 32 columns
df_OtherMajorRoads = pd.read_excel(xls, '2013 Other Major Rds')
print(df_OtherMajorRoads.shape)
df_OtherMajorRoads.head()
(513740, 32)
| GridId | Toid | GRID_ExactCut_ID | Location_ExactCut | BoroughName_ExactCut | DotRef | Length (m) | Emissions | Year | Pollutant | ... | Artic5Axle | Artic6Axle | PetrolCar | DieselCar | PetrolLgv | DieselLgv | LtBus | Coach | ElectricCar | ElectricLgv | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5911 | 4000000027989878 | 2 | External | NonGLA | 28440 | 9.714495 | DFT | 2013 | CO2 | ... | 3.006694 | 12.549219 | 18.791658 | 19.630267 | 0.279151 | 11.005820 | 0.000000 | 0.744254 | 0.0 | 0.0 |
| 1 | 5911 | 4000000027989880 | 2 | External | NonGLA | 28440 | 0.000000 | DFT | 2013 | CO2 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 |
| 2 | 5911 | 4000000027989882 | 2 | External | NonGLA | 57226 | 8.577192 | DFT | 2013 | CO2 | ... | 0.760333 | 2.446611 | 19.478135 | 10.300493 | 0.120149 | 7.734197 | 0.754408 | 0.868990 | 0.0 | 0.0 |
| 3 | 5911 | 4000000028014332 | 2 | External | NonGLA | 57226 | 9.347936 | DFT | 2013 | CO2 | ... | 0.823130 | 2.648621 | 20.173154 | 10.553940 | 0.123945 | 7.418739 | 0.820669 | 0.897038 | 0.0 | 0.0 |
| 4 | 5911 | 4000000027888882 | 2 | External | NonGLA | 28440 | 0.000000 | DFT | 2013 | CO2 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 |
5 rows × 32 columns
# concat the two dataframes
df_RoadEmissions = pd.concat([df_LTSroads, df_OtherMajorRoads], ignore_index=True)
print(df_RoadEmissions.shape)
df_RoadEmissions.head()
(879960, 33)
| GridId | Toid | GRID_ExactCut_ID | Location_ExactCut | BoroughName_ExactCut | Lts | Length (m) | Emissions | Year | Pollutant | ... | Artic6Axle | PetrolCar | DieselCar | PetrolLgv | DieselLgv | LtBus | Coach | ElectricCar | ElectricLgv | DotRef | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6253 | 4000000027908919 | 24 | External | NonGLA | 18898.0 | 50.761449 | DFT | 2013 | CO2 | ... | 0.190560 | 8.761443 | 4.810774 | 0.037550 | 1.735121 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
| 1 | 6253 | 4000000027947931 | 24 | External | NonGLA | 18895.0 | 28.592125 | DFT | 2013 | CO2 | ... | 0.000000 | 0.015535 | 0.008576 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
| 2 | 6253 | 4000000028013383 | 24 | External | NonGLA | 15816.0 | 5.101391 | DFT | 2013 | CO2 | ... | 0.021509 | 0.939028 | 0.518684 | 0.004055 | 0.184415 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
| 3 | 6253 | 4000000028025820 | 24 | External | NonGLA | 15816.0 | 3.757501 | DFT | 2013 | CO2 | ... | 0.015843 | 0.691654 | 0.382044 | 0.002987 | 0.135834 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
| 4 | 6253 | 4000000028029388 | 24 | External | NonGLA | 15816.0 | 1.624593 | DFT | 2013 | CO2 | ... | 0.006850 | 0.299044 | 0.165180 | 0.001292 | 0.058729 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
5 rows × 33 columns
traffic_excel = DocZip.read('1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2013_AADT-VKM.xlsx')
traffic_xls = pd.ExcelFile(BytesIO(traffic_excel))
df_AADT = pd.read_excel(traffic_xls, 'MajorGrid_AADTandVKM_2013')
print(df_AADT.shape)
df_AADT.head()
(87999, 44)
| RowID | Year | Toid | GRID_ExactCut_ID | Location_ExactCut | BoroughName_ExactCut | TLRN | MotorwayNumber | AADT Motorcycle | AADT Taxi | ... | VKM_Coach | VKM_Rigid2Axle | VKM_Rigid3Axle | VKM_Rigid4Axle | VKM_Artic3Axle | VKM_Artic5Axle | VKM_Artic6Axle | VKM_ElectricCar | VKM_ElectricLgv | VKM_TOTAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 2013.0 | 4.000000e+15 | 836.0 | Outer | Hillingdon | Other | Other | 88.301916 | 77.112580 | ... | 149.248696 | 293.680300 | 55.978941 | 39.030966 | 16.191367 | 10.970609 | 3.993946 | 4.335614 | 1.235289 | 16605.011414 |
| 1 | 2.0 | 2013.0 | 4.000000e+15 | 2217.0 | Outer | Hillingdon | Other | Other | 88.301916 | 77.112580 | ... | 98.338925 | 193.503902 | 36.884134 | 25.717231 | 10.668379 | 7.228458 | 2.631583 | 2.856706 | 0.813924 | 10940.494996 |
| 2 | 3.0 | 2013.0 | 4.000000e+15 | 282.0 | External | NonGLA | Other | Other | 310.363572 | 100.322495 | ... | 1657.075319 | 12950.212101 | 3011.364039 | 2861.551314 | 1710.809301 | 1966.897025 | 1647.110606 | 221.806380 | 47.635028 | 796735.125068 |
| 3 | 4.0 | 2013.0 | 4.000000e+15 | 873.0 | Outer | Hillingdon | Other | Other | 39.473081 | 144.548284 | ... | 118.008843 | 9777.985094 | 2051.227418 | 1024.275647 | 470.758531 | 815.631678 | 1959.389833 | 78.775616 | 15.287825 | 284144.265992 |
| 4 | 5.0 | 2013.0 | 4.000000e+15 | 2930.0 | Outer | Hillingdon | Other | Other | 39.473081 | 144.548284 | ... | 401.216526 | 33244.027352 | 6973.937855 | 3482.419671 | 1600.524988 | 2773.054115 | 6661.700602 | 267.828056 | 51.976850 | 966057.900401 |
5 rows × 44 columns
# left join df_RoadEmissions and df_AADT on the ['Toid' and 'GRID_ExactCut_ID'] column
df = pd.merge(df_RoadEmissions, df_AADT, how='left',
left_on=['Toid', 'GRID_ExactCut_ID'],
right_on=['Toid', 'GRID_ExactCut_ID'])
print(df.shape)
df.head()
(879960, 75)
| GridId | Toid | GRID_ExactCut_ID | Location_ExactCut_x | BoroughName_ExactCut_x | Lts | Length (m)_x | Emissions | Year_x | Pollutant | ... | VKM_Coach | VKM_Rigid2Axle | VKM_Rigid3Axle | VKM_Rigid4Axle | VKM_Artic3Axle | VKM_Artic5Axle | VKM_Artic6Axle | VKM_ElectricCar | VKM_ElectricLgv | VKM_TOTAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6253 | 4000000027908919 | 24 | External | NonGLA | 18898.0 | 50.761449 | DFT | 2013 | CO2 | ... | 0.0 | 2383.880434 | 229.558857 | 335.509098 | 194.242109 | 247.217230 | 176.583736 | 28.990862 | 5.460174 | 104036.993985 |
| 1 | 6253 | 4000000027947931 | 24 | External | NonGLA | 18895.0 | 28.592125 | DFT | 2013 | CO2 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.046589 | 0.000000 | 140.050860 |
| 2 | 6253 | 4000000028013383 | 24 | External | NonGLA | 15816.0 | 5.101391 | DFT | 2013 | CO2 | ... | 0.0 | 239.573680 | 23.070058 | 33.717777 | 19.520818 | 24.844678 | 17.746199 | 2.913505 | 0.548733 | 10455.442789 |
| 3 | 6253 | 4000000028025820 | 24 | External | NonGLA | 15816.0 | 3.757501 | DFT | 2013 | CO2 | ... | 0.0 | 176.461358 | 16.992575 | 24.835302 | 14.378333 | 18.299696 | 13.071212 | 2.145983 | 0.404177 | 7701.103189 |
| 4 | 6253 | 4000000028029388 | 24 | External | NonGLA | 15816.0 | 1.624593 | DFT | 2013 | CO2 | ... | 0.0 | 76.294807 | 7.346907 | 10.737788 | 6.216614 | 7.912054 | 5.651467 | 0.927837 | 0.174750 | 3329.647849 |
5 rows × 75 columns
# remove unnecessary columns
columnstokeep = df.columns
for a in columnstokeep:
if 'VKM' in a:
columnstokeep = columnstokeep.drop(a)
df = df[columnstokeep]
print(df.shape)
df.head()
(879960, 58)
| GridId | Toid | GRID_ExactCut_ID | Location_ExactCut_x | BoroughName_ExactCut_x | Lts | Length (m)_x | Emissions | Year_x | Pollutant | ... | AADT Rigid3Axle | AADT Rigid4Axle | AADT Artic3Axle | AADT Artic5Axle | AADT Artic6Axle | AADT ElectricCar | AADT ElectricLgv | AADT TOTAL | Speed (kph) | Length (m)_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6253 | 4000000027908919 | 24 | External | NonGLA | 18898.0 | 50.761449 | DFT | 2013 | CO2 | ... | 12.389882 | 18.108290 | 10.483747 | 13.342950 | 9.530679 | 1.564711 | 0.29470 | 5615.144325 | 42.269546 | 50.761449 |
| 1 | 6253 | 4000000027947931 | 24 | External | NonGLA | 18895.0 | 28.592125 | DFT | 2013 | CO2 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.004464 | 0.00000 | 13.419814 | 32.236053 | 28.592125 |
| 2 | 6253 | 4000000028013383 | 24 | External | NonGLA | 15816.0 | 5.101391 | DFT | 2013 | CO2 | ... | 6.194941 | 9.054145 | 5.241873 | 6.671475 | 4.765339 | 0.782356 | 0.14735 | 2807.572163 | 35.051885 | 10.202783 |
| 3 | 6253 | 4000000028025820 | 24 | External | NonGLA | 15816.0 | 3.757501 | DFT | 2013 | CO2 | ... | 6.194941 | 9.054145 | 5.241873 | 6.671475 | 4.765339 | 0.782356 | 0.14735 | 2807.572163 | 35.051885 | 7.515003 |
| 4 | 6253 | 4000000028029388 | 24 | External | NonGLA | 15816.0 | 1.624593 | DFT | 2013 | CO2 | ... | 6.194941 | 9.054145 | 5.241873 | 6.671475 | 4.765339 | 0.782356 | 0.14735 | 2807.572163 | 35.051885 | 3.249186 |
5 rows × 58 columns
df.columns
Index(['GridId', 'Toid', 'GRID_ExactCut_ID', 'Location_ExactCut_x',
'BoroughName_ExactCut_x', 'Lts', 'Length (m)_x', 'Emissions', 'Year_x',
'Pollutant', 'Emissions Unit', 'Motorcycle', 'Taxi', 'Car',
'BusAndCoach', 'Lgv', 'Rigid', 'Artic', 'Rigid2Axle', 'Rigid3Axle',
'Rigid4Axle', 'Artic3Axle', 'Artic5Axle', 'Artic6Axle', 'PetrolCar',
'DieselCar', 'PetrolLgv', 'DieselLgv', 'LtBus', 'Coach', 'ElectricCar',
'ElectricLgv', 'DotRef', 'RowID', 'Year_y', 'Location_ExactCut_y',
'BoroughName_ExactCut_y', 'TLRN', 'MotorwayNumber', 'AADT Motorcycle',
'AADT Taxi', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv',
'AADT LtBus', 'AADT Coach', 'AADT Rigid2Axle', 'AADT Rigid3Axle',
'AADT Rigid4Axle', 'AADT Artic3Axle', 'AADT Artic5Axle',
'AADT Artic6Axle', 'AADT ElectricCar', 'AADT ElectricLgv', 'AADT TOTAL',
'Speed (kph)', 'Length (m)_y'],
dtype='object')
label = []
columnstokeep = df.columns
for a in columnstokeep:
if 'AADT' in a:
label = label + [a]
elif 'Speed' in a:
label = label + [a]
elif 'Length' in a:
label = label + [a]
label
['Length (m)_x', 'AADT Motorcycle', 'AADT Taxi', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT LtBus', 'AADT Coach', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Rigid4Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle', 'AADT ElectricCar', 'AADT ElectricLgv', 'AADT TOTAL', 'Speed (kph)', 'Length (m)_y']
# remove irrelevant columns
label.remove('AADT TOTAL')
label.remove('Speed (kph)')
label.remove('Length (m)_y')
label
['Length (m)_x', 'AADT Motorcycle', 'AADT Taxi', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT LtBus', 'AADT Coach', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Rigid4Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle', 'AADT ElectricCar', 'AADT ElectricLgv']
target = ['Motorcycle', 'Taxi', 'Car',
'BusAndCoach', 'Lgv', 'Rigid', 'Artic', 'Rigid2Axle', 'Rigid3Axle',
'Rigid4Axle', 'Artic3Axle', 'Artic5Axle', 'Artic6Axle', 'PetrolCar',
'DieselCar', 'PetrolLgv', 'DieselLgv', 'LtBus', 'Coach', 'ElectricCar',
'ElectricLgv']
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# plot the correlation matrix
corr = df[label + target].corr()
plt.figure(figsize=(30,30))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()
# sum the target variables to one column
df['Total Emissions'] = df[target].sum(axis=1)
# Split df by unique 'Pollutant' values into dictionary
df_dict = {k: v for k, v in df.groupby('Pollutant')}
df_dict.keys()
dict_keys(['CO2', 'NOx', 'PM10_Brake', 'PM10_Exhaust', 'PM10_Resusp', 'PM10_Tyre', 'PM25_Brake', 'PM25_Exhaust', 'PM25_Resusp', 'PM25_Tyre'])
for k, v in df_dict.items():
print(k, v.shape)
CO2 (87996, 59) NOx (87996, 59) PM10_Brake (87996, 59) PM10_Exhaust (87996, 59) PM10_Resusp (87996, 59) PM10_Tyre (87996, 59) PM25_Brake (87996, 59) PM25_Exhaust (87996, 59) PM25_Resusp (87996, 59) PM25_Tyre (87996, 59)
from sklearn.model_selection import train_test_split
train_test_set = {}
for k, v in df_dict.items():
x_train, x_test, y_train, y_test = train_test_split(v[label], v['Total Emissions'], test_size=0.2, random_state=42)
# drop indexes
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
train_test_set[k] = {'x_train': x_train, 'x_test': x_test, 'y_train': y_train, 'y_test': y_test}
train_test_set.keys()
dict_keys(['CO2', 'NOx', 'PM10_Brake', 'PM10_Exhaust', 'PM10_Resusp', 'PM10_Tyre', 'PM25_Brake', 'PM25_Exhaust', 'PM25_Resusp', 'PM25_Tyre'])
from sklearn.feature_selection import SelectKBest, f_regression
# feature selection
for k, v in train_test_set.items():
selector = SelectKBest(f_regression, k=10)
selector.fit(v['x_train'], v['y_train'])
v['x_train'] = selector.transform(v['x_train'])
v['x_test'] = selector.transform(v['x_test'])
v['selected_features'] = [label[i] for i in selector.get_support(indices=True)]
v['x_train'] = pd.DataFrame(v['x_train'], columns=v['selected_features'])
v['x_test'] = pd.DataFrame(v['x_test'], columns=v['selected_features'])
v['selector'] = selector
print(k, v['selected_features'])
CO2 ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle'] NOx ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle'] PM10_Brake ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle'] PM10_Exhaust ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle'] PM10_Resusp ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle'] PM10_Tyre ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle'] PM25_Brake ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle'] PM25_Exhaust ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle'] PM25_Resusp ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle'] PM25_Tyre ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
# rank the selected features by their scores and pvalues using selector
for k, v in train_test_set.items():
print(k)
for i in range(len(v['selected_features'])):
print(v['selected_features'][i],
v['selector'].scores_[i],
v['selector'].pvalues_[i])
CO2 Length (m)_x 23902.929633754928 0.0 AADT Pcar 1336.9976842095416 5.494646680566392e-290 AADT Dcar 582.8689772096944 2.971227847243228e-128 AADT PLgv 22267.173089016138 0.0 AADT DLgv 41915.93607963614 0.0 AADT Rigid2Axle 42142.24283493276 0.0 AADT Rigid3Axle 29262.938191929497 0.0 AADT Artic3Axle 46.29843847699099 1.0235382383240951e-11 AADT Artic5Axle 2794.4960280143246 0.0 AADT Artic6Axle 20252.94443209883 0.0 NOx Length (m)_x 25199.00406728696 0.0 AADT Pcar 1933.640411861189 0.0 AADT Dcar 953.1646645612719 6.706565319175453e-208 AADT PLgv 22384.543240906278 0.0 AADT DLgv 41900.15594616222 0.0 AADT Rigid2Axle 44386.849430008064 0.0 AADT Rigid3Axle 31181.185419332247 0.0 AADT Artic3Axle 25.324472931056917 4.857127545287284e-07 AADT Artic5Axle 3448.082301181072 0.0 AADT Artic6Axle 22482.45776086851 0.0 PM10_Brake Length (m)_x 38683.2432146443 0.0 AADT Pcar 2962.7360692582956 0.0 AADT Dcar 1620.511587746007 0.0 AADT PLgv 25840.02640642982 0.0 AADT DLgv 30801.887803789727 0.0 AADT Rigid2Axle 29519.65403221131 0.0 AADT Rigid3Axle 27152.067570247164 0.0 AADT Artic3Axle 3.2804639882439957 0.07011336638544322 AADT Artic5Axle 3599.9202903149685 0.0 AADT Artic6Axle 20742.480764929813 0.0 PM10_Exhaust Length (m)_x 21620.943593596217 0.0 AADT Pcar 1265.123428559489 1.190617668901583e-274 AADT Dcar 628.2292701958095 4.909081815610148e-138 AADT PLgv 21108.32428100092 0.0 AADT DLgv 40932.04712885946 0.0 AADT Rigid2Axle 42163.078322254085 0.0 AADT Rigid3Axle 28910.707151723433 0.0 AADT Artic3Axle 105.68849112772484 8.984237008445372e-25 AADT Artic5Axle 2763.8219202103915 0.0 AADT Artic6Axle 19334.615578500583 0.0 PM10_Resusp Length (m)_x 17394.967006344326 0.0 AADT Pcar 860.9758350795003 4.093947432063161e-188 AADT Dcar 394.84024282225107 1.2733223918480454e-87 AADT PLgv 16106.274718291455 0.0 AADT DLgv 34182.723940202915 0.0 AADT Rigid2Axle 36921.14041658401 0.0 AADT Rigid3Axle 24070.9896754166 0.0 AADT Artic3Axle 30.76573481406677 2.9217653667944158e-08 AADT Artic5Axle 2360.5218555592533 0.0 AADT Artic6Axle 18755.936341716075 0.0 PM10_Tyre Length (m)_x 27264.180259448123 0.0 AADT Pcar 1162.8543844906014 8.407983402639132e-253 AADT Dcar 478.53576055379347 1.0033082120536442e-105 AADT PLgv 24527.983872955032 0.0 AADT DLgv 44137.25820898399 0.0 AADT Rigid2Axle 42698.51136379059 0.0 AADT Rigid3Axle 30348.281981145126 0.0 AADT Artic3Axle 224.20872971314435 1.307524825797232e-50 AADT Artic5Axle 2761.8427120687165 0.0 AADT Artic6Axle 20362.10792590025 0.0 PM25_Brake Length (m)_x 39563.24242949934 0.0 AADT Pcar 2914.575934461469 0.0 AADT Dcar 1613.656445389349 0.0 AADT PLgv 25650.05050439342 0.0 AADT DLgv 31220.19802930797 0.0 AADT Rigid2Axle 29622.614375626104 0.0 AADT Rigid3Axle 26880.292563898027 0.0 AADT Artic3Axle 3.5357668733612653 0.06006268045986792 AADT Artic5Axle 3655.427613131223 0.0 AADT Artic6Axle 20555.3213518843 0.0 PM25_Exhaust Length (m)_x 21960.826750954293 0.0 AADT Pcar 1257.4778539588929 5.105697896084948e-273 AADT Dcar 597.5052107746775 2.069285930605026e-131 AADT PLgv 21293.37094202227 0.0 AADT DLgv 41605.56545776436 0.0 AADT Rigid2Axle 43043.95362958769 0.0 AADT Rigid3Axle 29398.472812733846 0.0 AADT Artic3Axle 112.02250933342329 3.696521863192289e-26 AADT Artic5Axle 2780.8012875752993 0.0 AADT Artic6Axle 19476.13679680666 0.0 PM25_Resusp Length (m)_x 17508.076847041673 0.0 AADT Pcar 843.1614225062681 2.746024706505372e-184 AADT Dcar 362.6369905521536 1.198514413961006e-80 AADT PLgv 16046.454548760841 0.0 AADT DLgv 33989.14652901631 0.0 AADT Rigid2Axle 36907.81055888567 0.0 AADT Rigid3Axle 24060.85231687838 0.0 AADT Artic3Axle 35.86461673918812 2.1253685942018337e-09 AADT Artic5Axle 2297.4932265679067 0.0 AADT Artic6Axle 18541.778199961012 0.0 PM25_Tyre Length (m)_x 27354.984411691832 0.0 AADT Pcar 1181.1029512249286 1.0559383455650056e-256 AADT Dcar 489.9179077726642 3.480721716445588e-108 AADT PLgv 24193.121503926428 0.0 AADT DLgv 44730.39588858886 0.0 AADT Rigid2Axle 43381.929381283895 0.0 AADT Rigid3Axle 30302.210534771802 0.0 AADT Artic3Axle 213.02996738096178 3.526501301225809e-48 AADT Artic5Axle 2734.7982145928704 0.0 AADT Artic6Axle 20291.496453043907 0.0
# visualize correlation matrix of selected features
for k, v in train_test_set.items():
corr = pd.DataFrame(v['x_train'], columns=v['selected_features']).corr()
# set title of plt to k
print(k)
plt.figure(figsize=(10,10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()
CO2
NOx
PM10_Brake
PM10_Exhaust
PM10_Resusp
PM10_Tyre
PM25_Brake
PM25_Exhaust
PM25_Resusp
PM25_Tyre
# standardize the df using sklearn standard scaler
from sklearn.preprocessing import StandardScaler
# stardaize the x_train and x_test
for k, v in train_test_set.items():
# initialize the scaler
scaler = StandardScaler()
# fit the scaler to the x_train
scaler.fit(v['x_train'])
# transform the x_train and x_test
v['x_train'] = scaler.transform(v['x_train'])
v['x_test'] = scaler.transform(v['x_test'])
v['x_train'] = pd.DataFrame(v['x_train'], columns=v['selected_features'])
v['x_test'] = pd.DataFrame(v['x_test'], columns=v['selected_features'])
# save the scaler
v['scaler'] = scaler
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# model set for each pollutant
LR_set = {}
for k, v in train_test_set.items():
model = LinearRegression()
model.fit(v['x_train'], v['y_train'])
y_pred = model.predict(v['x_test'])
r2 = r2_score(v['y_test'], y_pred)
mse = mean_squared_error(v['y_test'], y_pred)
rmse = np.sqrt(mse)
LR_set[k] = {'model': model, 'y_pred': y_pred, 'mse': mse, 'r2': r2, 'rmse': rmse, 'y_test': v['y_test'], 'x_test': v['x_test'], 'x_train': v['x_train'], 'y_train': v['y_train']}
print(k, mse, r2, rmse)
CO2 156343.23707624507 0.6228061024706681 395.40262654191497 NOx 1.3337240174376739 0.5887914721059816 1.1548696971683317 PM10_Brake 0.002064996161808893 0.581905620953646 0.04544222883848121 PM10_Exhaust 0.0008067369355814339 0.5964592889324114 0.028403114892233808 PM10_Resusp 0.007830744110917233 0.5904504796613461 0.08849149174308925 PM10_Tyre 0.00028900442231542513 0.650662925495141 0.017000130067603165 PM25_Brake 0.00035156196710148805 0.566541695180498 0.01874998578936763 PM25_Exhaust 0.0005769161710396386 0.5835788811077669 0.024019079312905367 PM25_Resusp 1.0696322799024033e-05 0.5908667099465907 0.0032705233218896378 PM25_Tyre 0.00015482685366526594 0.6211541868187214 0.012442943930809379
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# model set for each pollutant
SVR_set = {}
for k, v in train_test_set.items():
model = SVR(kernel='linear', C=1, gamma=1)
model.fit(v['x_train'], v['y_train'])
y_pred = model.predict(v['x_test'])
mse = mean_squared_error(v['y_test'], y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(v['y_test'], y_pred)
SVR_set[k] = {'model': model, 'y_pred': y_pred, 'mse': mse,
'r2': r2, 'rmse': rmse,
'y_test': v['y_test'], 'x_test': v['x_test'],
'x_train': v['x_train'], 'y_train': v['y_train']}
print(k, mse, r2, rmse)
CO2 205750.76003393464 0.5036054481910563 453.59757498683194 NOx 1.522708912356757 0.5305243947962537 1.2339809205805239 PM10_Brake 0.003126316147087352 0.36702293573558564 0.055913470175686215 PM10_Exhaust 0.004618347885348237 -1.310159988234227 0.06795842762563181 PM10_Resusp 0.010696433148599308 0.4405743562433685 0.10342356186381954 PM10_Tyre 0.005143767056453705 -5.217581450967644 0.07172006034892682 PM25_Brake 0.0008522858713561948 -0.050825810497931556 0.029193935523601385 PM25_Exhaust 0.0044828676360326725 -2.235757377850038 0.06695422044974217 PM25_Resusp 0.004265076974011152 -162.1387727815619 0.06530755679101119 PM25_Tyre 0.005459629273487326 -12.359166338511196 0.07388930418868028
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
# model set for each pollutant
DTR_set = {}
for k, v in train_test_set.items():
model = DecisionTreeRegressor(max_depth=5, random_state=42)
model.fit(v['x_train'], v['y_train'])
y_pred = model.predict(v['x_test'])
mse = mean_squared_error(v['y_test'], y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(v['y_test'], y_pred)
DTR_set[k] = {'model': model, 'y_pred': y_pred,
'mse': mse, 'r2': r2, 'rmse': rmse,
'y_test': v['y_test'], 'x_test': v['x_test'],
'x_train': v['x_train'], 'y_train': v['y_train']}
print(k, mse, r2, rmse)
CO2 26796.95069826198 0.9353496418214142 163.69774188504243 NOx 0.27815509112324527 0.9142403195476974 0.5274041060925154 PM10_Brake 0.0011160096951484228 0.7740444320758149 0.03340673128500337 PM10_Exhaust 0.00012539893120846478 0.9372737609559078 0.011198166421716761 PM10_Resusp 0.0009744026984635491 0.9490385393622965 0.031215424047472896 PM10_Tyre 5.0229914878548736e-05 0.9392840726251969 0.007087306602549994 PM25_Brake 0.00022817528094263225 0.7186713019768844 0.015105471887452978 PM25_Exhaust 9.613658983853144e-05 0.9306081050997367 0.009804926814542342 PM25_Resusp 1.4630324990932855e-06 0.9440391514863675 0.001209558803487158 PM25_Tyre 2.895337285167896e-05 0.9291539947840726 0.005380833843530105
for k, v in DTR_set.items():
fig, ax = plt.subplots(figsize=(120, 12))
plot_tree(v['model'],
ax=ax,
feature_names=v['x_train'].columns,
fontsize=12,
filled=True)
plt.show()
# Grid Search for Decision Tree Regressor
from sklearn.model_selection import GridSearchCV
for k, v in train_test_set.items():
model = DecisionTreeRegressor(random_state=42)
param_grid = {'max_depth': [
8, 9, 10, 11, 12, 13, 14, 15, 16, 17]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(v['x_train'], v['y_train'])
# save the best model
DTR_set[k]['best_model'] = grid_search.best_estimator_
# save the best score
DTR_set[k]['best_score'] = grid_search.best_score_
# save the best parameters
DTR_set[k]['best_params'] = grid_search.best_params_
# save best mse, rmse and r2
y_pred = grid_search.best_estimator_.predict(v['x_test'])
mse = mean_squared_error(v['y_test'], y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(v['y_test'], y_pred)
DTR_set[k]['best_mse'] = mse
DTR_set[k]['best_rmse'] = rmse
DTR_set[k]['best_r2'] = r2
# print best parameters
print(k, grid_search.best_params_, mse, r2, rmse)
CO2 {'max_depth': 14} 10247.125837086382 0.9752777708505601 101.22808818251178
NOx {'max_depth': 12} 0.15336603560590006 0.9527147888874312 0.39161975895746126
PM10_Brake {'max_depth': 10} 0.0009293047246800024 0.8118460997672856 0.030484499744624353
PM10_Exhaust {'max_depth': 14} 6.154704803656036e-05 0.969213335309999 0.007845192670454968
PM10_Resusp {'max_depth': 16} 0.0002690010521754376 0.9859311898935056 0.01640125154295969
PM10_Tyre {'max_depth': 13} 1.7422481525247925e-05 0.9789403958670143 0.0041740246196264735
PM25_Brake {'max_depth': 9} 0.0001498214702083944 0.8152776279029379 0.01224015809572713
PM25_Exhaust {'max_depth': 13} 4.137020615323757e-05 0.9701387681609122 0.006431967518049013
PM25_Resusp {'max_depth': 16} 3.915724807628567e-07 0.9850223913059635 0.0006257575255343372
PM25_Tyre {'max_depth': 14} 8.304113879230473e-06 0.9796806645562366 0.002881685943892997
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
RFR_set = {}
for k, v in train_test_set.items():
model = RandomForestRegressor(random_state=42)
model.fit(v['x_train'], v['y_train'])
y_pred = model.predict(v['x_test'])
mse = mean_squared_error(v['y_test'], y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(v['y_test'], y_pred)
RFR_set[k] = {'model': model, 'y_pred': y_pred,
'mse': mse, 'r2': r2, 'rmse': rmse,
'y_test': v['y_test'], 'x_test': v['x_test'],
'x_train': v['x_train'], 'y_train': v['y_train']}
print(k, mse, r2, rmse)
CO2 5182.412786670413 0.9874969041567384 71.98897684139158 NOx 0.0793740868329035 0.9755276946559032 0.28173407112542054 PM10_Brake 0.0006430227269420954 0.8698088680716739 0.02535789279380476 PM10_Exhaust 2.9690074433202608e-05 0.9851486237706606 0.0054488599204973705 PM10_Resusp 0.00012833779545375515 0.9932879070207226 0.011328627253721218 PM10_Tyre 1.0223574497084213e-05 0.9876421489429809 0.0031974324851487034 PM25_Brake 0.00010593440864047174 0.8693881776520503 0.010292444250054102 PM25_Exhaust 2.619579850298466e-05 0.9810917352113209 0.0051181831251904866 PM25_Resusp 1.9976005031682747e-07 0.9923591978156442 0.0004469452430855792 PM25_Tyre 4.6307604902270805e-06 0.9886689926066657 0.0021519201867697324
param_grid = {'n_estimators': [120, 150, 200]}
for k, v in train_test_set.items():
model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(v['x_train'], v['y_train'])
# save the best model
RFR_set[k]['best_model'] = grid_search.best_estimator_
# save the best score
RFR_set[k]['best_score'] = grid_search.best_score_
# save the best parameters
RFR_set[k]['best_params'] = grid_search.best_params_
# save best mse, rmse and r2
y_pred = grid_search.best_estimator_.predict(v['x_test'])
mse = mean_squared_error(v['y_test'], y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(v['y_test'], y_pred)
RFR_set[k]['best_mse'] = mse
RFR_set[k]['best_rmse'] = rmse
RFR_set[k]['best_r2'] = r2
# print best parameters
print(k, grid_search.best_params_, mse, r2, rmse)
CO2 {'n_estimators': 200} 5181.916028565403 0.98749810263599 71.98552652141542
NOx {'n_estimators': 200} 0.07917984362339837 0.9755875829560235 0.2813891320278706
PM10_Brake {'n_estimators': 120} 0.0006479354559862355 0.86881420065431 0.02545457632698363
PM10_Exhaust {'n_estimators': 150} 3.0021769435629244e-05 0.9849827054505327 0.005479212483161175
PM10_Resusp {'n_estimators': 150} 0.00012748472250863533 0.9933325229104182 0.011290913271681585
PM10_Tyre {'n_estimators': 200} 9.870728996230191e-06 0.9880686545792372 0.0031417716333671025
PM25_Brake {'n_estimators': 200} 0.00010477083756667516 0.8708228025329586 0.010235762676355639
PM25_Exhaust {'n_estimators': 200} 2.6032444649002596e-05 0.9812096448801173 0.005102199981282838
PM25_Resusp {'n_estimators': 200} 1.982879971088954e-07 0.9924155037053795 0.0004452954043204302
PM25_Tyre {'n_estimators': 200} 4.686965799321414e-06 0.9885314638413071 0.0021649401375838117